home
***
CD-ROM
|
disk
|
FTP
|
other
***
search
/
Language/OS - Multiplatform Resource Library
/
LANGUAGE OS.iso
/
cocktail
/
docme.lha
/
doc.me
/
scangen.me
< prev
next >
Wrap
Text File
|
1992-09-25
|
64KB
|
2,181 lines
.\" use: pic | tbl | eqn | ditroff -me
.\"
.\" "@(#)bibmac.me 2.2 9/9/83";
.de IP
.ip \\$1 \\$2
..
.de LP
.lp
..
.\" @(#)bmac.std 2.2 9/9/83;
.\" standard format troff commands
.\" citation formatting strings
.ds [[ [
.ds ]] ]
.ds ], ,\|
.ds ]- -
.ds [. " \&
.ds .] .
.ds [, " \&
.ds ,] ,
.ds [? " \&
.ds ?] ?
.ds [: " \&
.ds :] :
.ds [; " \&
.ds ;] ;
.ds [! " \&
.ds !] !
.ds [" " \&
.ds "] \&"
.ds [' " \&
.ds '] '
.ds [< " \&
.ds >]
.\" reference formmating strings
.ds a] " \&
.ds b] , \&
.ds c] , \&
.ds n] "\& and \&
.ds m] "\& and \&
.ds p] .
.\" reference formmating macros
.de s[ \" start reference
.nh
.IP [\\*([F] 5m
..
.de e[ \" end reference
.[-
..
.de [] \" start to display collected references
.LP
..
.de ][ \" choose format
.ie !"\\*([J"" \{\
. ie !"\\*([V"" .nr t[ 1 \" journal
. el .nr t[ 5 \" conference paper
.\}
.el .ie !"\\*([B"" .nr t[ 3 \" article in book
.el .ie !"\\*([R"" .nr t[ 4 \" technical report
.el .ie !"\\*([I"" .nr t[ 2 \" book
.el .nr t[ 0 \" other
.\\n(t[[
..
.de 0[ \" other
.s[
.if !"\\*([A"" \\*([A\\c
.if !"\\*([T"" , \\*([T\\c
.if !"\\*([V"" , Vol. \\*([V\\c
.if !"\\*([O"" , \\*([O\\c
.if !"\\*([D"" , \\*([D\\c
\&.
.e[
..
.de 1[ \" journal article
.s[
.if !"\\*([A"" \\*([A,
.if !"\\*([T"" \\*([T,
\\fI\\*([J \\*([V\\fP\c
.if !"\\*([N"" ,\\*([N
.if !"\\*([D"" (\\*([D)\c
.if !"\\*([P"" , \\*([P\c
.if !"\\*([I"" , \\*([I\c
\\&.
.if !"\\*([O"" \\*([O.
.e[
..
.de 2[ \" book
.s[
.ie !"\\*([A"" \\*([A,
.el .if !"\\*([E"" \{\
. ie \\n([E-1 \\*([E, eds.,
. el \\*([E, ed.,\}
.if !"\\*([T"" \\fI\\*([T\\fP,
.rm a[
.if !"\\*([I"" .ds a[ \\*([I
.if !"\\*([C"" \{\
. if !"\\*(a["" .as a[ , \\&
. as a[ \\*([C\}
.if !"\\*([D"" \{\
. if !"\\*(a["" .as a[ , \\&
. as a[ \\*([D\}
\\*(a[.
.if !"\\*([G"" Gov. ordering no. \\*([G.
.if !"\\*([O"" \\*([O.
.e[
..
.de 3[ \" article in book
.s[
.if !"\\*([A"" \\*([A,
.if !"\\*([T"" \\*([T,
in \\fI\\*([B\\fP\c
.if !"\\*([V"" , vol. \\*([V
.if !~\\*([E~~ \{\
. ie , \\n([E-1 \\*([E (editors)\c
. el , \\*([E (editor)\c\}
.if !"\\*([I"" , \\*([I\c
.if !"\\*([C"" , \\*([C\c
.if !"\\*([D"" , \\*([D\c
.if !"\\*([P"" , \\*([P\c
\\&.
.if !"\\*([O"" \\*([O.
.e[
..
.de 4[ \" report
.s[
.if !"\\*([A"" \\*([A,
.if !~\\*([E~~ \{\
. ie \\n([E-1 \\*([E, editors.
. el \\*([E, editor.\}
\\*([T,
\\*([R\c
.if !"\\*([G"" \& (\\*([G)\c
.if !"\\*([I"" , \\*([I\c
.if !"\\*([C"" , \\*([C\c
.if !"\\*([D"" , \\*([D\c
\\&.
.if !"\\*([O"" \\*([O.
.e[
..
.de 5[ \" conference paper
.s[
.if !"\\*([A"" \\*([A,
.if !"\\*([T"" \\*([T,
\\fI\\*([J\\fP,
.if !"\\*([C"" \\*([C,
.if !"\\*([D"" \\*([D\c
.if !"\\*([P"" , \\*([P\c
\\&.
.if !"\\*([O"" \\*([O.
.e[
..
.de [- \" clean up after yourself
.rm [A [B [C [D
.rm [E [F [G
.rm [I [J [K
.rm [N [O [P
.rm [R [T
.rm [V [W
..
.\" @(#)bmac.std 2.2 8/24/83;
.\" standard format troff commands
.\" citation formatting strings
.ds [[ [
.ds ]] ]
.ds ], ,\|
.ds ]- -
.ds [. " \&
.ds .] .
.ds [, " \&
.ds ,] ,
.ds [< " \&
.ds >]
.\" reference formmating strings
.ds c] , \&
.ds n] "" and \&
.ds m] "" and \&
.ds a] " \&
.\" reference formmating macros
.de s[ \" start reference
.nh
.IP [\\*([F] 5m
..
.de e[ \" end reference
.[-
..
.de [] \" start to display collected references
.SH
References
.LP
..
.de ][ \" choose format
.ie !"\\*([J"" \{\
. ie !"\\*([V"" .nr t[ 1 \" journal
. el .nr t[ 5 \" conference paper
.\}
.el .ie !"\\*([B"" .nr t[ 3 \" article in book
.el .ie !"\\*([R"" .nr t[ 4 \" technical report
.el .ie !"\\*([I"" .nr t[ 2 \" book
.el .nr t[ 0 \" other
.\\n(t[[
..
.de 0[ \" other
.s[
.if !"\\*([A"" \\*([A,
.if !"\\*([T"" \\*([T,
.if !"\\*([O"" \\*([O\c
.if !"\\*([D"" , \\*([D\c
\&.
.e[
..
.de 1[ \" journal article
.s[
.if !"\\*([A"" \\*([A,
.if !"\\*([T"" \\*([T,
\\fI\\*([J \\*([V\\fP,
.if !"\\*([N"" \\*([N
.if !"\\*([D"" (\\*([D),
.if !"\\*([P"" \\*([P\c
.if !"\\*([I"" , \\*([I\c
\\&.
.if !"\\*([O"" \\*([O.
.e[
..
.de 2[ \" book
.s[
.ie !"\\*([A"" \\*([A,
.el .if !"\\*([E"" \{\
. ie \\n([E-1 \\*([E, eds.,
. el \\*([E, ed.,\}
.if !"\\*([T"" \\fI\\*([T\\fP,
.rm a[
.if !"\\*([I"" .ds a[ \\*([I
.if !"\\*([C"" \{\
. if !"\\*(a["" .as a[ , \\&
. as a[ \\*([C\}
.if !"\\*([D"" \{\
. if !"\\*(a["" .as a[ , \\&
. as a[ \\*([D\}
\\*(a[.
.if !"\\*([G"" Gov. ordering no. \\*([G.
.if !"\\*([O"" \\*([O.
.e[
..
.de 3[ \" article in book
.s[
.if !"\\*([A"" \\*([A,
.if !"\\*([T"" \\*([T,
in \\fI\\*([B\\fP,
.if !"\\*([V"" vol. \\*([V,
.if !"\\*([E"" \\*([E (ed.),
.if !"\\*([I"" \\*([I,
.if !"\\*([C"" \\*([C,
.if !"\\*([D"" \\*([D\c
.if !"\\*([P"" , \\*([P\c
\\&.
.if !"\\*([O"" \\*([O.
.e[
..
.de 4[ \" report
.s[
.if !"\\*([A"" \\*([A,
\\*([T,
\\*([R\c
.if !"\\*([G"" \& (\\*([G)\c
.if !"\\*([I"" , \\*([I\c
.if !"\\*([C"" , \\*([C\c
.if !"\\*([D"" , \\*([D\c
\\&.
.if !"\\*([O"" , \\*([O.
.e[
..
.de 5[ \" conference paper
.s[
.if !"\\*([A"" \\*([A,
.if !"\\*([T"" \\*([T,
\\fI\\*([J\\fP,
.if !"\\*([C"" \\*([C\c
.if !"\\*([D"" , \\*([D\c
.if !"\\*([P"" , \\*([P\c
\\&.
.if !"\\*([O"" , \\*([O.
.e[
..
.de [- \" clean up after yourself
.rm [A [B [C [D
.rm [E [F [G
.rm [I [J [K
.rm [N [O [P
.rm [R [T
.rm [V [W
..
.if t \{ \
.pl 29.7c \" page length
.po 2.5c \" page offset (left margin)
.ll 16.5c \" line length
.lt 16.5c \" title length
.nr LL 16.5c
.nr )l 29.7c
.nr hm 2c
.nr $r 9 \" factor for vertical spacing
.nr $R \n($r
.sz 12 \" font size
.nr pp 12
.nr sp 12
.nr tp 12
.nr fp 10
.hc ~ \" hyphenation character
. \" Umlauts and sharp s
.ds A \(A:
.ds O \(O:
.ds U \(U:
.ds a \(a:
.ds o \(o:
.ds u \(u:
.ds s \(ss
. \" UMLAUT \*:u, etc.
.ds : \v'-0.6m'\h'(1u-(\\n(.fu%2u))*0.13m+0.06m'\z.\h'0.2m'\z.\h'-((1u-(\\n(.fu%2u))*0.13m+0.26m)'\v'0.6m'
.\}
.if n \{ \
.po 0 \" page offset (left margin)
.ll 78 \" line length
.lt 78 \" title length
.nr $r 4 \" factor for vertical spacing
.nr $R \n($r
.hc ~ \" hyphenation character
. \" Umlaute und scharfes s
.ds A Ae
.ds O Oe
.ds U Ue
.ds a ae
.ds o oe
.ds u ue
.ds s sz
.\}
.de _
\&\\$1\l'|0\(ul'\\$2
..
.de FT \" font for programs
.ft C
.sz -2
..
.de FR
.ft R
.sz +2
..
.de [] \" start to display collected references
.uh References
.lp
..
.de $0 \" collect table of contents
.(x
.ta 2c
.ie '\\$2'' \\$1
.el \\$2. \\$1
.)x
..
.de np
.nr $p +1
.ip \\n($p.
..
.de SH
.sp 0.5
.in -3
.r \\$1
.sp 0.5
.in +3
..
.de PP
.sp 0.5
..
.de IP
.ip \\$1 \\$2
..
.de I
.i \\$1
..
.de TH
..
.EQ
delim off
.EN
.de FR
.ft R
.sz +2
..
.b " "
.sp 1c
.ta 9c
.ft R
.sz 12
\l'17.1c'
.nf
Efficient Generation of
Table-Driven Scanners
J. Grosch
\l'17.1c'
.sp 12.5c
\l'17.1c'
.ft H
.nf
GESELLSCHAFT F\*UR MATHEMATIK
UND DATENVERARBEITUNG MBH
FORSCHUNGSSTELLE F\*UR
PROGRAMMSTRUKTUREN
AN DER UNIVERSIT\*AT KARLSRUHE
.r
\l'17.1c'
.bp
.\" oh ''Scanner Generation'%'
.\" eh ''Scanner Generation'%'
.nr % 0
.ce 99
.sz 20
.b " "
.sp 2
Project
.sp
.b "Compiler Generation"
.sp
.sz 12
\l'15c'
.sp
.sz 16
.b "Efficient Generation of Table-Driven Scanners"
or
.b "NFA to DFA Conversion in Practically Linear Time"
.sp 2
Josef Grosch
.sp 2
.sz 14
May 24, 1987
.sp
.sz 12
\l'15c'
.sp 2
Report No. 2
.sp 2
Copyright \(co 1987 GMD
.sp 2
Gesellschaft f\*ur Mathematik und Datenverarbeitung mbH
Forschungsstelle an der Universit\*at Karlsruhe
Vincenz-Prie\*snitz-Str. 1
D-7500 Karlsruhe
.ce 0
.bp
.de $p
.sp 1.5
.ne 2c
.ps 10
.if '\\$3'' \{\
.ce
.tr aAbBcCdDeEfFgGhHiIjJkKlLmMnNoOpPqQrRsStTuUvVwWxXyYzZ
\fR\\$1\fP
.tr aabbccddeeffgghhiijjkkllmmnnooppqqrrssttuuvvwwxxyyzz
.\}
.if '\\$3'1' \{\
.ce
.tr aAbBcCdDeEfFgGhHiIjJkKlLmMnNoOpPqQrRsStTuUvVwWxXyYzZ
\fR\\$1\fP
.tr aabbccddeeffgghhiijjkkllmmnnooppqqrrssttuuvvwwxxyyzz
.\}
.if '\\$3'2' \{\
\fB\\$1\fP
.\}
.ps
.sp 0.1
..
.nr pp 10
.nr sp 10
.nr tp 10
.\" nr $r 5.3 \" factor for vertical spacing
.nr $r 12 \" factor for vertical spacing
.nr $R \n($r
.sz 10
.EQ
delim $$
gsize 10
gfont R
.EN
.ce 99
.sz 12
.b "Efficient Generation of Lexical Analysers"
.sz 10
.sp
J. Grosch
.i
GMD Forschungsstelle an der Universit\*at Karlsruhe
Vincenz-Prie\*snitz-Str. 1, D-7500 Karlsruhe, Germany
grosch@karlsruhe.gmd.de
.he '''%'
.bp 1
.b
SUMMARY
.ce 0
.lp
.b
General tools for the automatic generation of lexical analysers like
LEX\*([<\*([[Les75\*(]]\*(>] convert a specification consisting of a set of regular expressions
into a deterministic finite automaton. The main algorithms involved are
subset construction, state minimization, and table compression.
Even if these algorithms do not
show their worst case time behaviour they are still quite expensive. This
paper shows how to solve the problem in linear time for practical cases,
thus resulting in a significant speed-up. The idea is to combine the
automaton introduced by Aho and Corasick\*([<\*([[AhC75\*(]]\*(>] with the direct
computation of an efficient table representation. Besides the algorithm we
present experimental results of the scanner generator Rex\*([<\*([[Gro87a\*(]]\*(>]
which uses this technique.
.sp
.lp
KEY WORDS lexical analysis scanner generator
.sh 1 Introduction
.pp
Today, there exist several tools to generate lexical analysers like e.g.
LEX\*([<\*([[Les75\*(]]\*(>], flex\*([<\*([[Pax88\*(]]\*(>], GLA\*([<\*([[Heu86\*(],Wai86\*(]]\*(>],
ALEX\*([<\*([[M\*os86\*(]]\*(>], or Mkscan\*([<\*([[HoL87\*(]]\*(>]. This
indicates that the automatic implementation of scanners becomes accepted by
today's compiler writers. In the past, the low performance of generated
scanners in comparison to hand-written ones may have restricted the
generation approach to prototyping applications. However, since newer
results\*([<\*([[Gro87b\*(],Heu86\*(],Wai86\*(]]\*(>] show that generated
scanners have reached the speed of hand-written ones
there is no argument against using automatically generated lexical analysers
in production compilers.
In the following we present an efficient algorithm to convert a scanner
specification based on regular expressions into a deterministic finite
automaton.
.\" .pp
.\" In compiler construction the run time of a scanner generator is not a crucial
.\" problem. But following the "swiss army knife"
.\" idea .[aho ullman principles 1977]. a
.\" scanner generator can be used for a wide range of applications and thus
.\" become a frequently used tool. In this case the generation time becomes
.\" important because nobody would accept a turn around time of e.g. 10 minutes
.\" and usually some iterations are needed to complete a task.
.pp
A specification of a scanner consists of a set of regular expressions (REs).
Each RE usually describes a deterministic finite automaton (DFA) being able
to recognize a token. The whole set of REs, however, usually specifies a
nondeterministic finite automaton (NFA). To allow scanning to be done in
linear time the NFA has to be converted into a DFA. This problem can be
solved with well known algorithms for subset construction and state
minimization\*([<\*([[ASU86\*(],WaG84\*(]]\*(>].
In the worst case subset construction takes time $ O ( 2 sup n ) $ and
state minimization $ O ( n sup 2 ) $ or
$ O ( n~log~n ) $ if n is the number of states.
If the DFA is implemented as a table-driven automaton an additional
algorithm for table-compression is needed in practice, usually taking time
$ O ( n sup 2 ) $.
.(b
.vs 12
Running example:
.sp 0.5
.FT
letter ( letter | digit ) * \(-> IdentSymbol
digit + \(-> DecimalSymbol
octdigit + Q \(-> OctalSymbol
BEGIN \(-> BeginSymbol
END \(-> EndSymbol
:= \(-> AssignSymbol
.)b
.(z
.PS
circlerad = circlerad/2
N0: circle "0"
arrow "0-7" above
N3: circle "3"
arrow "Q" above
circle "4"
A3: arc from N3.e to N3.s at N3 + (arcrad * 0.7, - arcrad * 0.7) cw ->
"0-7" at A3.e + (arcrad, 0) ljust
N5: circle "5" at N3 - (0, linewid)
arrow " E" above
circle "6"
arrow "G" above
circle "7"
arrow "I" above
circle "8"
arrow "N" above
circle "9"
N2: circle "2" at N3 + (0, linewid)
A2: arc from N2.ne to N2.se at N2 + (arcrad, 0) cw <-
"0-9" at A2.e + (arcrad, 0) ljust
N1: circle "1" at N2 + (0, linewid)
A1: arc from N1.ne to N1.se at N1 + (arcrad, 0) cw <-
"A-Z,a-z,0-9" at A1.e + (arcrad, 0) ljust
N10: circle "10" at N5 - (0, linewid)
arrow "N" above
circle "11"
arrow "D" above
circle "12"
N13: circle "13" at N10 - (0, linewid)
arrow "=" above
circle "14"
arrow "A-Z,a-z " rjust from N0 to N1 chop
arrow " 0-9" ljust from N0 to N2 chop
arrow "B" above from N0 to N5 chop
arrow " E" above from N0 to N10 chop
arrow " :" above from N0 to N13 chop
.PE
.sp
Fig. 1: NFA for the running example
.)z
.(z
.PS
N0: circle "0"
line invis
DUMMY: circle invis
N5: circle "5" at DUMMY - (0, linewid * 2)
arrow "E" above
N6: circle "6"
arrow "G" above
N7: circle "7"
arrow "I" above
N8: circle "8"
arrow "N" above
N9: circle "9"
N13: circle "13" at N5 - (0, linewid)
arrow "=" above
N14: circle "14"
N10: circle "10" at DUMMY + (0, linewid * 2)
arrow "N" above
N11: circle "11"
arrow "D" above
N12: circle "12"
N3: circle "3" at N10 + (0, linewid)
arrow "Q" above
circle "4"
A3: arc from N3.e to N3.s at N3 + (arcrad * 0.7, - arcrad * 0.7) cw ->
"0-7" at A3.e + (arcrad, 0) ljust
N2: circle "2" at N3 + (0, linewid)
A2: arc from N2.ne to N2.se at N2 + (arcrad, 0) cw <-
"0-9" at A2.e + (arcrad, 0) ljust
N1: circle "1" at N7 + (0, linewid * 2)
A1: arc from N1.e to N1.n at N1 + (arcrad * 0.7, arcrad * 0.7) ->
"A-Z,a-z,0-9" at A1.e + (arcrad, 0) ljust
arrow " 8-9" ljust from N3 to N2 chop
arrow "8-9 " rjust from N0 to N2 chop
arrow "0-7" from N0 to N3 chop
arrow " E" ljust from N0 to N10 chop
arrow "A,C,D,F-Z,a-z" above from N0 to N1 chop
arrow " B" ljust from N0 to N5 chop
arrow " :" ljust from N0 to N13 chop
arrow " \(*a\\\\\\\\\\\\\\\\N" ljust from N10 to N1 chop
arrow " \(*a\\\\\\\\\\\\\\\\D" ljust from N11 to N1 chop
arrow " \(*a" ljust from N12 to N1 chop
arrow "\(*a\\\\\\\\\\\\\\\\E " rjust from N5 to N1 chop
arrow "\(*a\\\\\\\\\\\\\\\\G" rjust from N6 to N1 chop
arrow " \(*a\\\\\\\\\\\\\\\\I" ljust from N7 to N1 chop
arrow " \(*a\\\\\\\\\\\\\\\\N" ljust from N8 to N1 chop
arrow " \(*a" ljust from N9 to N1 chop
.PE
.sp
Fig. 2: DFA for the running example (\(*a \\\\\\\\ \(*b stands for A-Z,a-z,0-9 except \(*b)
.)z
.pp
The above specification describes six tokens each by a RE using an abstract
notation.
The automaton for these tokens is a NFA whose graphical representation is
shown in Figure 1.
The conversion of this NFA to a DFA yields the automaton shown in Figure 2.
.sp 0.5
.ne 2c
.lp
.b "Definition 1:"
constant RE
.pp
A RE consisting only of the concatenation of characters is called a
.i constant
RE. A constant RE contains no operators like + * | ?
.pp
The language of a constant RE contains exactly one word. In the above example
the constant REs are:
.(b
.FT
BEGIN END :=
.)b
.pp
During the construction of tables for a DFA by hand we observed that the
task was solvable easily and in linear time for constant REs. Common prefixes
have simply to be factored out thus arriving at a DFA having a decision tree
as its graphical representation. Only the few non-constant REs required real
work.
.pp
Scanner specifications for languages like Modula or C usually contain 90%
constant REs: 60% keywords and 30% delimiters.
Only 10% are non-constant REs needed to specify identifiers, various
constants, and comments (see Appendix 2 for an example).
The languages Ada and Pascal are exceptions from
this observation because keywords can be written in upper or lower case
letters or in any mixture.
.pp
It would be nice to retain the linear time
behaviour for constant REs and perform subset construction and state
minimization only for the few non-constant REs.
The following chapter describes how this can be achieved by first converting
the NFA of the non-constant REs to a DFA and incrementally adding the
constant REs afterwards.
The results of this method are shown for the running example.
Then we generalize the method to automata with several start states. We
conclude with a comparison of some scanner generators and present
experimental results.
.sh 1 "The Method"
.pp
The basic idea is to combine the special automaton of Aho and
Corasick\*([<\*([[AhC75\*(]]\*(>] with the so-called "comb-vector" or row
displacement
technique\*([<\*([[ASU86\*(]]\*(>] for the table representation. The automaton of
Aho and Corasick, called a pattern matching machine, extends a usual DFA by a
so-called failure function and was originally used to search for overlapping
occurrences of several strings in a given text. This automaton can also be
used to cope with a certain amount of nondeterminism.
.pp
To introduce the terminology needed we present a definition of our version
of the automaton. We call it a
.i "tunnel automaton"
and the tunnel function used corresponds to the
failure function of Aho and Corasick.
The name tunnel automaton is motivated by the analogy to the tunnel effect
from nuclear physics: Electrons can switch over to another orbit without
supply of energy and the tunnel automaton can change its state without
consumption of an input symbol.
.sp 0.5
.ne 2c
.lp
.b "Definition 2:"
tunnel automaton
.pp
A
.i "tunnel automaton"
is an extension of a DFA and consists of:
.TS
;
l l l.
StateSet a finite set of states
FinalStates a set of final states FinalStates \(ib StateSet
StartState a distinguished start state StartState \(mo StateSet
Vocabulary a finite set of input symbols
Control the transition function Control: StateSet \(mu Vocabulary \(-> StateSet
NoState a special state to denote undefined NoState \(mo StateSet
Semantics a function mapping the states Semantics: StateSet $ ->~2 sup Proc $
to subsets of a set of procedures
Tunnel a function mapping states to states Tunnel: StateSet \(-> StateSet
.TE
.pp
A tunnel automaton usually works like a DFA: in each step it consumes a
symbol and performs a state transition. Additionally the tunnel automaton is
allowed to change from state $ s $ to state $ t~=~Tunnel~( s ) $ without
consuming a symbol, if no transition is possible in state $ s $. In state
$ t $ the same rules apply, so the automaton may change the state several
times without consuming any symbols.
After recognizing a token a semantic procedure associated with the final
state is executed.
Algorithm 1 formalizes the behaviour of a tunnel automaton.
To guarantee the termination of the WHILE loop the StateStack is initialized
with a special final state called DefaultState.
.sp 0.5
.(b L
.vs 12
\fBAlgorithm 1\fR: tunnel automaton
.sp 0.5
.FT
BEGIN
Push (StateStack , DefaultState)
Push (SymbolStack, Dummy )
State := StartState
Symbol := NextSymbol ()
LOOP
IF Control (State, Symbol) \(!= NoState THEN
State := Control (State, Symbol)
Push (StateStack , State )
Push (SymbolStack, Symbol)
Symbol := NextSymbol ()
ELSE
State := Tunnel (State)
IF State = NoState THEN EXIT END
END
END
WHILE NOT Top (StateStack) \(mo FinalStates DO
State := Pop (StateStack )
UnRead (Pop (SymbolStack))
END
Semantics (Top (StateStack)) ()
END
.)b
.sp 0.5
.pp
Before we present the algorithm to compute the control function
we need some more definitions:
.sp 0.5
.ne 2c
.lp
.b "Definition 3:"
trace
.pp
The
.i trace
of a string is the sequence of states that a tunnel automaton passes
through given the string as input. States at which the automaton does not
consume a symbol are
omitted. This includes the start state. If necessary we pad the trace with
the value NoState (denoted by the character -) to adjust its length to the
length of the string.
.(b
.\" vs 12
Examples (computed by using the DFA of Fig. 2 as tunnel automaton):
.TS
;
l l l l.
The trace of IF is 1 1
The trace of ENDIF is 10 11 12 1 1
The trace of 1789 is 3 3 2 2
The trace of 777Q is 3 3 3 4
The trace of ::= is 13 - -
.TE
.)b
.ne 2c
.lp
.b "Definition 4:"
ambiguous state
.pp
We call a state $ s $
.i ambiguous
(or ambiguously reachable)
if there exist more than one string such that
for each string the repeated execution of the control function (first loop
in Algorithm 1) ends up in state $ s $.
.pp
Example: The states 1, 2, 3, and 4 of the DFA of Fig. 2 are ambiguous
states.
.sp
.lp
.b Method:
Construction of a tunnel automaton from a given set of regular expressions
.ip "Step 1:" 2c
Convert the NFA for non-constant REs to a DFA using the usual algorithms for
subset construction and state minimization\*([<\*([[ASU86\*(],WaG84\*(]]\*(>].
.ip "Step 2:" 2c
Extend the DFA to a tunnel automaton by setting $ Tunnel~( s ) $ to
$ NoState $ for every state $ s $.
.ip "Step 3:" 2c
Compute the set of ambiguous states of the tunnel automaton.
For convenience Appendix 1 contains an algorithm to compute the
ambiguous states.
.ip "Step 4:" 2c
For every constant RE execute Step 5 which incrementally extends the tunnel
automaton. Continue with Step 6.
.ip "Step 5:" 2c
Compute the trace of the string specified by the constant RE using the
current tunnel automaton. We have to distinguish 4 cases:
.ip "Case 1:" 2c
The trace contains neither NoState nor ambiguous states:
.\sp 0.5
.(l
.PS
dist = 0.7
T1: "Let the trace be" ljust
line right 1.5i invis
S1: circle "$ s sub 1 $"
arrow
S2: circle "$ s sub 2 $"
arrow right 0.2i
line right 0.3i invis "..."
arrow right 0.2i
Sn: circle "$ s sub n $"
"Let the RE be" ljust at T1 + (0, linewid * dist)
A1: "$ a sub 1 $" at S1 + (0, linewid * dist)
A2: "$ a sub 2 $" at S2 + (0, linewid * dist)
An: "$ a sub n $" at Sn + (0, linewid * dist)
"..." at 0.5 between A2 and An
.PE
.)l
.\sp 0.5
.ip " " 2c
As the path for the RE already exists include (add) the semantics of RE to
the semantics of $ s sub n $ and include $ s sub n $ into the set of final
states.
.ip "Case 2:" 2c
The trace does not contain ambiguous states but it contains NoState:
.\sp 0.5
.(l
.PS
T1: "Let the trace be" ljust
line right 1.5i invis
S1: circle "$ s sub 1 $"
arrow
S2: circle "$ s sub 2 $"
arrow right 0.2i
line right 0.3i invis "..."
arrow right 0.2i
Si: circle "$ s sub i $"
"New States" ljust at T1 - (0, linewid)
Zi1: circle "$ z sub i+1 $" at Si + (linewid, -linewid)
arrow right 0.2i
line right 0.3i invis "..."
arrow right 0.2i
Zn: circle "$ z sub n $"
Si1: "-" at Zi1 + (0, linewid)
Sn: "-" at Zn + (0, linewid)
arrow from Si to Zi1 chop
"Let the RE be" ljust at T1 + (0, linewid * dist)
A1: "$ a sub 1 $" at S1 + (0, linewid * dist)
A2: "$ a sub 2 $" at S2 + (0, linewid * dist)
Ai: "$ a sub i $" at Si + (0, linewid * dist)
Ai1: "$ a sub i+1 $" at Si1 + (0, linewid * dist)
An: "$ a sub n $" at Sn + (0, linewid * dist)
"..." at 0.5 between A2 and Ai
"..." at 0.5 between Ai1 and An
"..." at 0.5 between Si1 and Sn
.PE
.)l
.\sp 0.5
.ip " " 2c
Let $ s sub i $ be the last state before NoState.
Extend the path $ s sub 1 ,~... ,~s sub i $ by newly created states
$ z sub i+1 , ... ,~z sub n $. Extend the control function to pass through the
states $ z sub i+1 , ... ,~z sub n $ given the input
$ a sub i+1 , ... ,~a sub n $ starting from state $ s sub i $. Set the
semantics of the states $ z sub i+1 , ... ,~z sub n-1 $ to the empty set.
Set the semantics of $ z sub n$ to the semantics of RE and include
$ z sub n $ into the set of final states. Set the tunnel function of
$ z sub i+1 , ... ,~z sub n $ to NoState.
.ip "Case 3:" 2c
The trace does not contain NoState but it contains ambiguous states:
.\sp 0.5
.(l
.PS
T1: "Let the trace be" ljust
line right 1.5i invis
S1: circle "$ s sub 1 $"
arrow
S2: circle "$ s sub 2 $"
arrow right 0.2i
line right 0.3i invis "..."
arrow right 0.2i
Si1: circle "$ s sub i-1 $"
arrow dashed
Si: circle "$ s sub i $"
arrow right 0.2i
line right 0.3i invis "..."
arrow right 0.2i
Sn: circle "$ s sub n $"
"New States" ljust at T1 - (0, linewid)
Zi: circle "$ z sub i $" at Si - (0, linewid)
arrow right 0.2i
line right 0.3i invis "..."
arrow right 0.2i
Zn: circle "$ z sub n $"
arrow from Si1 to Zi chop
arrow from Zi to Si chop dotted
arrow from Zn to Sn chop dotted
"Let the RE be" ljust at T1 + (0, linewid * dist)
A1: "$ a sub 1 $" at S1 + (0, linewid * dist)
A2: "$ a sub 2 $" at S2 + (0, linewid * dist)
Ai1: "$ a sub i-1 $" at Si1 + (0, linewid * dist)
Ai: "$ a sub i $" at Si + (0, linewid * dist)
An: "$ a sub n $" at Sn + (0, linewid * dist)
"..." at 0.5 between A2 and Ai1
"..." at 0.5 between Ai and An
.PE
.)l
.\sp 0.5
.ip " " 2c
Let $ s sub i $ be the first ambiguous state in the trace.
Create new states $ z sub i , ... ,~z sub n $ and
extend the control function to pass through the
states $ z sub i , ... ,~z sub n $ given the input
$ a sub i , ... ,~a sub n $ starting from state $ s sub i-1 $. Note, that the
transition from $ s sub i-1 $ to $ s sub i $ on input $ a sub i $ is lost
this way. Set the
semantics of the states $ z sub i , ... ,~z sub n $ to the semantics of the
corresponding states $ s sub i , ... ,~s sub n $.
Include the semantics of RE to the semantics of $ z sub n $ and include
$ z sub n $ into the set of final states. Set the tunnel function of
$ z sub i , ... ,~z sub n $ to the corresponding states
$ s sub i , ... ,~s sub n $.
.ip "Case 4:" 2c
The trace contains ambiguous states as well as NoState:
.\sp 0.5
.lp
.PS
T1: "Let the trace be" ljust
line right 1.5i invis
S1: circle "$ s sub 1 $"
arrow
S2: circle "$ s sub 2 $"
arrow right 0.2i
line right 0.2i invis "..."
arrow right 0.2i
Si1: circle "$ s sub i-1 $"
arrow dashed
Si: circle "$ s sub i $"
arrow right 0.2i
line right 0.2i invis "..."
arrow right 0.2i
Sj: circle "$ s sub j $"
"New States" ljust at T1 - (0, linewid)
Zi: circle "$ z sub i $" at Si - (0, linewid)
arrow right 0.2i
line right 0.2i invis "..."
arrow right 0.2i
Zj: circle "$ z sub j $"
arrow
Zj1: circle "$ z sub j+1 $"
arrow right 0.2i
line right 0.2i invis "..."
arrow right 0.2i
Zn: circle "$ z sub n $"
Sj1: "-" at Zj1 + (0, linewid)
Sn: "-" at Zn + (0, linewid)
arrow from Si1 to Zi chop
arrow from Zi to Si chop dotted
arrow from Zj to Sj chop dotted
"Let the RE be" ljust at T1 + (0, linewid * dist)
A1: "$ a sub 1 $" at S1 + (0, linewid * dist)
A2: "$ a sub 2 $" at S2 + (0, linewid * dist)
Ai1: "$ a sub i-1 $" at Si1 + (0, linewid * dist)
Ai: "$ a sub i $" at Si + (0, linewid * dist)
Aj: "$ a sub j $" at Sj + (0, linewid * dist)
Aj1: "$ a sub j+1 $" at Sj1 + (0, linewid * dist)
An: "$ a sub n $" at Sn + (0, linewid * dist)
"..." at 0.5 between A2 and Ai1
"..." at 0.5 between Ai and Aj
"..." at 0.5 between Aj1 and An
"..." at 0.5 between Sj1 and Sn
.PE
.\sp 0.5
.ip " " 2c
Let $ s sub i $ be the first ambiguous state in the trace.
Let $ s sub j $ be the last state before NoState.
Create new states $ z sub i , ... ,~z sub n $ and
extend the control function to pass through the
states $ z sub i , ... ,~z sub n $ given the input
$ a sub i , ... ,~a sub n $ starting from state $ s sub i-1 $. Note again
that the
transition from $ s sub i-1 $ to $ s sub i $ on input $ a sub i $ is lost
this way. Set the
semantics of the states $ z sub i , ... ,~z sub j $ to the semantics of the
corresponding states $ s sub i , ... ,~s sub j $.
Set the
semantics of the states $ z sub j+1 , ... ,~z sub n-1 $ to the empty set.
Set the semantics of $ z sub n $ to the semantics of RE and
include
$ z sub n $ into the set of final states. Set the tunnel function of
$ z sub i , ... ,~z sub j $ to the corresponding states
$ s sub i , ... ,~s sub j $ and set the tunnel function of the states
$ z sub j+1 , ... ,~z sub n $ to NoState.
.ip "Step 6:" 2c
If an unambiguous semantic function is desired convert
.(b
Semantics (s) = { $ p sub 1 ,~... ,~p sub n $ } to Semantics (s) = { $ p sub i $ }
.)b
.ip " " 2c
for all states s. We assume the procedures $ p sub 1 ,~... ,~p sub n $ to be
ordered totally with $ p sub i $ being the maximal (see below) procedure of
$ p sub 1 ,~... ,~p sub n $.
.sp 2
.lp
.(z L
.vs 12
\fBAlgorithm 2\fR: extend a tunnel automaton to recognize an additional constant RE
.sp 0.5
.FT
BEGIN
State := StartState
Symbol := NextSymbol () /* reading the string specified by the constant RE */
.sp 0.5
/* trace and do nothing */
.sp 0.5
LOOP
IF Control (State, Symbol) = NoState OR
Symbol = EndOfInput OR
Control (State, Symbol) \(mo AmbiguousStates THEN EXIT END
State := Control (State, Symbol) /* trace */
Symbol := NextSymbol ()
END
PreviousState := State
.sp 0.5
/* trace and duplicate the path */
.sp 0.5
LOOP
IF Control (State, Symbol) \(!= NoState THEN
IF Symbol = EndOfInput THEN EXIT END
NewState := CreateState ()
State := Control (State, Symbol) /* trace */
Control (PreviousState, Symbol) := NewState
Semantics (NewState) := Semantics (State)
Tunnel (NewState) := State
PreviousState := NewState
Symbol := NextSymbol ()
ELSE
IF Tunnel (State) = NoState THEN EXIT END
State := Tunnel (State)
END
END
.sp 0.5
/* extend the path */
.sp 0.5
WHILE Symbol \(!= EndOfInput DO
NewState := CreateState ()
Control (PreviousState, Symbol) := NewState
Semantics (NewState) := \(O/
Tunnel (NewState) := NoState
PreviousState := NewState
Symbol := NextSymbol ()
END
.sp 0.5
/* process new final state */
.sp 0.5
FinalStates := FinalStates \(cu { PreviousState }
Semantics (PreviousState) := Semantics (PreviousState) \(cu Semantics (RE)
END
.)z
.pp
Algorithm 2 describes step 5 more precisely.
The problem of an ambiguous semantic function arises already in the
running example, as e.g. the string END matches both the REs for IdentSymbol
and EndSymbol.
Therefore state 12 of Fig. 2 would be associated with two semantic
procedures. The generators LEX and Rex for example turn the semantic
function into an unambiguous one by
considering the sequence of the REs in the given specification.
The REs and their associated
semantic actions are mapped to priorities in descending order. In case of
conflicts the semantic action with greatest priority is preferred. In other
words the procedures are ordered totally and the "maximal procedure" out of
several ones is selected.
.pp
A tunnel automaton extended using Algorithm 2 works correctly because of the
following reasons.
The constant REs are recognized correctly because of the construction used.
We construct a decision tree without any ambiguous states.
.pp
The non-constant REs are recognized correctly because: If the tunnel
automaton stops at a newly created state we have propagated the semantics of
the original final state to the new one. Otherwise the tunnel automaton uses
at most one tunnel transition and stops at exactly the same state as it
would have stopped at before. That is because of the construction of the tunnel
function. As we did not change the semantic function of previously existing
states, except in the case where we could use the state as final state of a
constant RE, the automaton behaves as before.
.pp
We call a tunnel automaton
.i minimal
if it has the minimal number of states to do its job, e. g. it must be able
to distinguish between different tokens using separate final states.
Without a formal proof we state that Algorithm 2 constructs a minimal
automaton if the given DFA was minimal.
The reason is that a tunnel automaton has the same number of states and
the same "structure" as a corresponding DFA except that many regular
transitions are replaced by a few tunnel transitions.
Compare for example the Figures 2 and 3.
.sh 1 Example
.pp
Algorithm 2 constructs for the running example the tunnel automaton depicted
in Fig. 3. The dotted arrows denote tunnel transitions.
The automaton in Fig. 3 is graphically similar to the one in Fig. 2: most of
the transitions leading to state 1 have been replaced by tunnel transitions.
However, note the difference: whereas a solid arrow of Fig. 2 stands for a
big set of transitions a dotted arrow of Fig. 3 denotes a single tunnel
transition, only. This is the key to the efficient representation of the
control and tunnel functions. As the control function corresponds to a
sparse matrix it can advantageously be implemented using the comb-vector
technique\*([<\*([[ASU86\*(]]\*(>]. The rows of a matrix are merged
into a single vector called Next. Two additional vectors called Base and
Check are necessary to accomplish the access of the original data.
The resulting data structure resembles
the merging of several combs into one. In combination with the above a
fourth array called Tunnel is used to compress the
table even more. This array corresponds directly to our tunnel function.
Fig. 4 shows some entries of the comb-vector for the running example. The
excerpt is restricted to the characters E, N, D.
.sp 0.5
.(b L
.sp
.PS
N0: circle "0"
line invis
DUMMY: circle invis
N5: circle "5" at DUMMY - (0, linewid * 2)
arrow "E" above
N6: circle "6"
arrow "G" above
N7: circle "7"
arrow "I" above
N8: circle "8"
arrow "N" above
N9: circle "9"
N13: circle "13" at N5 - (0, linewid)
arrow "=" above
N14: circle "14"
N10: circle "10" at DUMMY + (0, linewid * 2)
arrow "N" above
N11: circle "11"
arrow "D" above
N12: circle "12"
N3: circle "3" at N10 + (0, linewid)
arrow "Q" above
circle "4"
A3: arc from N3.e to N3.s at N3 + (arcrad * 0.7, - arcrad * 0.7) cw ->
"0-7" at A3.e + (arcrad, 0) ljust
N2: circle "2" at N3 + (0, linewid)
A2: arc from N2.ne to N2.se at N2 + (arcrad, 0) cw <-
"0-9" at A2.e + (arcrad, 0) ljust
N1: circle "1" at N7 + (0, linewid * 2)
A1: arc from N1.e to N1.n at N1 + (arcrad * 0.7, arcrad * 0.7) ->
"A-Z,a-z,0-9" at A1.e + (arcrad, 0) ljust
arrow " 8-9" ljust from N3 to N2 chop
arrow "8-9 " rjust from N0 to N2 chop
arrow "0-7" from N0 to N3 chop
arrow " E" ljust from N0 to N10 chop
arrow "A,C,D,F-Z,a-z" above from N0 to N1 chop
arrow " B" ljust from N0 to N5 chop
arrow " :" ljust from N0 to N13 chop
arrow from N10 to N1 chop dotted
arrow from N11 to N1 chop dotted
arrow from N12 to N1 chop dotted
arrow from N5 to N1 chop dotted
arrow from N6 to N1 chop dotted
arrow from N7 to N1 chop dotted
arrow from N8 to N1 chop dotted
arrow from N9 to N1 chop dotted
.PE
.sp
Fig. 3: tunnel automaton for the running example
.)b
.(z L
.TS
tab (;) center;
l c c c c r r r r r c r r r c
l | c | c | c | c | r | r | r | r | r | c | r | r | r | c
l | c | c | c | c | r | r | r | r | r | c | r | r | r | c.
;0;1;2;...;68;69;70;71;72;...;78;79;80;
_
Check;-;-;-;...; 0; 0; 1; 1;11;...; 0;10; 1;...
_
Next ;-;-;-;...; 1;10; 1; 1;12;...; 1;11; 1;...
_
.TE
.TS
tab (;) center;
l r r c r r r c
l | r | r | c | r | r | r | c
l | r | r | c | r | r | r | c.
State ;0;1;...;10;11;12
_
Base ;0;2;...; 1; 4; 0;...
_
Tunnel;-;-;...; 1; 1; 1;...
_
.TE
.sp
.FT
Control (State, Symbol) := IF Check [Base [State] + Symbol] = State
THEN Next [Base [State] + Symbol]
ELSE NoState
.FR
.sp
.ce 2
Fig. 4: comb vector data structure for the running example and access procedure
(for characters E, N, D only; ASCII codes: E=69, D=68, N=78)
.)z
.pp
An advantage of Algorithm 2 is that this data structure is computed in part
directly from the set of REs. There is no need to compute a complete
classical DFA first and to transform it into the above data structure using
time and space consuming table compression algorithms afterwards.
The construction of the comb-vector consists primarily of the computation of
the Base and the Tunnel values. For each state the Base value is determined
by a simple search algorithm which shows reasonable results and performance.
A clever computation of the Tunnel values is essential for a good table
compression. Its complexity is $ O ( n sup 2 c ) $ where n is the number of
states of the DFA and c is the cardinality of the character set. For each
state an other state has to be determined which can safely act as "Tunnel
state" and which saves a maximal number of transitions. It is sufficient to
perform this expensive algorithm for the few states created for non-constant
REs only. The Tunnel values for the states created for constant REs are
determined by Algorithm 2 in linear time.
.sh 1 "Several Start States"
.pp
Scanner generators like LEX and Rex offer the feature of so-called start
states to handle left context. In each start state a different set of REs
may be activated and there are special statements to change the current start
state. A scanner specification using several start
states is turned into an automaton with several start states.
Under these circumstances the construction algorithm for a tunnel automaton
becomes a little bit more complex.
We only mention the problems that arise from this extension.
.\" .sh 2 "Computation of Ambiguous States"
.pp
First we have to refine the computation of ambiguous states. Definition 4
is still valid saying that an ambiguous state must be reachable with more than one
string. Now we have to take into account that strings can be processed
starting from several start states. The difference of the refined algorithm
lies in the treatment of the states being direct successors of the start
states. Such a state becomes ambiguous if there exist more than one
transition from one start state. It is not sufficient if there are several
transitions but each one originates in a different start state.
.\" Appendix 2 gives the refined algorithm.
.\" .sh 2 "Definition of Start Sets"
.pp
If a constant RE should be recognized in a set of start states S we would
like to construct only one sequence of states which can be reached from all
members of S. This is because we want to create as few states as possible.
However, we have to be careful, because the trace of the RE could lead to a
state which can be reached from a start state t not contained in S. This
would erroneously allow the recognition of the RE also from start state t.
Therefore we have to construct several sequences of states or we have to
branch off an existing sequence in this case. To be able to do this we have
to know the set of start states a state can be reached from.
.\" .ne 2c
.\" .lp
.\" .b "Definition 5:"
.\" start set
.\" .pp
.\" The
.\" .i "start set"
.\" of a state s is the set of start states, from which s is
.\" reachable by a path of transitions.
.\" .pp
Step 5 of the construction of a tunnel automaton has to be extended not only
to consider ambiguous states but also to check whether the set of start
states S of the RE is a proper subset of the set of start states a state
in the trace can be reached from.
In this case a side branch with newly created states is necessary.
.\" .sh 2 "Recording Traces"
.pp
We extend a tunnel automaton to recognize a RE once for each corresponding
start state. Our aim is to use the same sequence of states constructed for
the first start state, but we have to check whether this is safely possible.
To be able to reuse a sequence of states we have to record it. For every
character (index) and every associated state t from the traces of the RE we
record the state actually used in the tunnel automaton as target state of
the current transition. This is either the state t or a newly created one.
.\" .pp
The construction method is now extended in the following way. For each
character index i of the RE the state t from the trace is examined. Whenever
possible the state recorded for i and t is reused.
.if 0 \{\
.sh 2 "Algorithm"
.pp
Algorithm 3 extends Algorithm 2 to handle several start
states. It has to be executed once for each start state a RE should be
recognized in. The required input consists of:
.(b
.\" vs 12
.TS
;
l l.
RE constant RE (string) accessed by function NextSymbol
StartState current start state for RE
StartStates complete set of start states for RE
StartSet mapping: StateSet \(-> $ 2 sup StartStates $
(precomputed according to Definition 5)
.TE
.)b
.sp 0.5
.(l L
.vs 12
\fBAlgorithm 3\fR: extend a tunnel automaton to recognize an additional constant RE
.sp 0.5
.FT
BEGIN
State := StartState
Symbol := NextSymbol ()
Index := 1
PreviousState := State
.sp 0.5
/* trace and reuse existing path */
.sp 0.5
LOOP
State := Control (State, Symbol) /* trace */
IF Symbol = EndOfInput OR
State = NoState OR
State \(mo AmbiguousStates OR
StartStates \(sb StartSet (State) THEN EXIT END
NewState := Record (Index, State)
IF NewState \(!= NoState THEN
Control (PreviousState, Symbol) := NewState
ELSE
NewState := State
Record (Index, State) := NewState
END
StartSet (NewState) := StartSet (NewState) \(cu { StartState }
PreviousState := NewState
Symbol := NextSymbol ()
Index +:= 1
END
.sp 0.5
/* trace and duplicate the path */
.sp 0.5
LOOP
IF Symbol = EndOfInput THEN EXIT END
State := Control (State, Symbol) /* trace */
IF State \(!= NoState THEN
NewState := Record (Index, State)
IF NewState = NoState THEN
NewState := CreateState ()
Semantics (NewState) := Semantics (State)
Tunnel (NewState) := State
Record (Index, State) := NewState
END
Control (PreviousState, Symbol) := NewState
StartSet (NewState) := StartSet (NewState) \(cu { StartState }
PreviousState := NewState
Symbol := NextSymbol ()
Index +:= 1
ELSE
IF Tunnel (State) = NoState THEN EXIT END
State := Tunnel (State)
END
END
.sp 0.5
/* extend the path */
.sp 0.5
WHILE Symbol \(!= EndOfInput DO
NewState := Record (Index, NoState)
IF NewState = NoState THEN
NewState := CreateState ()
Semantics (NewState) := \(O/
Tunnel (NewState) := NoState
Record (Index, NoState) := NewState
END
Control (PreviousState, Symbol) := NewState
StartSet (NewState) := StartSet (NewState) \(cu { StartState }
PreviousState := NewState
Symbol := NextSymbol ()
Index +:= 1
END
.sp 0.5
/* process new final state */
.sp 0.5
FinalStates := FinalStates \(cu { PreviousState }
Semantics (PreviousState) := Semantics (PreviousState) \(cu Semantics (RE)
END
.)l
.sp 0.5
.\}
.sh 1 "Experimental Results"
.pp
The presented method has been used in a scanner generator called
.i Rex
(Regular EXpression tool)
which is implemented by a 5,500 line Modula-2\*([<\*([[Wir85\*(]]\*(>] program.
It is able to generate
scanners in the languages C and Modula-2. The generated scanners are
table-driven implementations of a tunnel
automaton. In the following we compare the time to generate a scanner as
well as the speed of the generated scanners for LEX\*([<\*([[Les75\*(]]\*(>], flex\*([<\*([[Pax88\*(]]\*(>],
and Rex\*([<\*([[Gro87a\*(],Gro88\*(]]\*(>].
Flex is a rewrite of LEX intended to generate lexical analysers much faster
and the analysers use smaller tables and run faster.
.pp
To compare the scanner generators we used 4 versions of a Modula-2 scanner
specification (see Appendix 2 for an example written in the input language
of Rex). The versions differ in
the number of constant REs as shown in Table 1.
Version 1 is incomplete, version 2 omits keywords, version 3
has keywords, and version 4 has lower as well as upper case keywords.
The times are given in seconds measured on a MC 68020 processor.
The optimization of flex can be controlled by options: usually -cem results
in the smallest tables and -cf in the fastest analyser.
.(z C
.\" vs 12
\fBTable 1:\fP Comparison of Scanner Generators
.sp 0.5
.TS
tab (;) box center;
l s | c c c c
l s | r r r r
l s | r r r r
l s | r r r r
l s | r r r r
l s | r r r r
l s | r r r r
l l | r r r r.
;Spec. 1;Spec. 2;Spec. 3;Spec. 4
_
# of non-constant REs ; 10; 10; 10; 10
# of constant REs ; 2; 29; 69;109
total # of REs ; 12; 39; 79;119
_
# of states for non-constant REs ; 40; 40; 40; 40
# of states for constant REs ; 0; 26;181;336
total # of states (generated by Rex); 40; 66;221;376
_
generation time using;LEX ;3.14;6.71;73.71;215.08
[seconds] ;flex -cem;0.69;0.78; 2.01; 3.81
.\" ;flex -cfe;0.69;1.12; 3.81; 8.60
;flex -cf ;1.39;2.35; 7.16; 12.16
;Rex ;3.56;3.76; 4.91; 6.16
_
table size using ;LEX ; 2,996; 4,708;39,156;67,384
[bytes] ;flex -cem; 1,116; 1,376; 2,868; 4,592
.\" ;flex -cfe; 2,016; 5,360;24,976;59,524
;flex -cf ;10,744;17,424;57,260;97,096
;Rex ; 3,114; 3,218; 4,366; 5,530
_
scanner size using ;LEX ; 5,464; 8,044;43,756;73,264
[bytes] ;flex -cem;14,240;15,368;18,140;21,144
.\" ;flex -cfe; 6,748;10,960;31,856;67,684
;flex -cf ;15,452;23,000;64,116;105,232
;Rex ; 8,456; 8,884;11,200;13,536
.TE
.)z
.pp
LEX performs well for small specifications but it seems to use a quadratic
or exponential algorithm for all the REs. This
leads to long generation times and large tables for large specifications
(containing e. g. many keywords).
.pp
Flex is quite an improvement compared to LEX especially in terms of
generation time. The sizes of the tables and the generated scanners depend on
the optimization flags: -cem reduces the sizes drastically but -cf yields
sizes even larger than LEX.
.pp
The timings of Rex demonstrate its linear behaviour. In general the
generation times are not quite as small as those of flex except for
specification 4.
The expensive algorithms
for subset construction, state minimization, and table compression
are executed for 40 states, only. An arbitrary number of
constant REs like keywords can be added in a small, linear growing time.
Although the generated tables are larger than those of flex the total sizes
of the scanners (including the tables) are smaller.
Compared to LEX
the correct scanner specification 3 is processed nearly 20 times faster by
Rex, the size of the table is 10 times smaller, and
the scanner is 4 times smaller.
.(z C
.\" vs 12
\fBTable 2:\fP Comparison of Generated Scanners
.sp 0.5
.TS
tab (;) box center;
l | c s s s | c s
l | c s s s | c s
l | r r r r | r r.
generator;with hashing of identifiers and;without hashing and
;number conversion;number conversion
_
;table size;scanner size;time;speed;time;speed
;[bytes];[bytes];[sec.];[lines/min.];[sec.];[lines/min.]
_
LEX ;39,156;43,756;7.21; 34,700;6.88; 36,400
flex -cem; 2,868;18,140;3.99; 62,700;3.69; 67,800
.\"flex -cfe;24,976;31,856;2.60; 96,300;2.29;109,300
flex -cf ;57,260;64,116;2.12;118,000;1.80;139,000
Rex -m ; 4,306;13,672;3.00; 83,400;2.22;112,700
Rex -c ; 4,366;11,200;1.77;141,400;1.37;182,700
.TE
.)z
.pp
To compare the generated scanners (Table 2) we used a big Modula-2 program
as input whose characteristics are as follows:
# of lines: 4,171, # of characters: 100,268, # of tokens: 16,948.
The timings include input from file, scanning and hashing of identifiers.
The Rex options -m and -c determine the target languages Modula-2 and C.
Compared to LEX flex yields a speed-up of 1.8 with options -cem and a
speed-up between 3.4 and 3.8 with options -cf. The C version of Rex reaches
a speed-up between 4 and 5. This speed is reached with analysers that are
considerable smaller than flex generated ones.
The figures show that a tunnel automaton can be implemented efficiently:
Scanners generated by Rex are 4 to 5 times faster than scanners generated by
LEX. More details can be found in reference\*([[Gro87b\*(]].
.(z C
.\" vs 12
\fBTable 3:\fP Comparison of Rex-Generated Scanners
.sp 0.5
.TS
tab (;) box center;
l | c s s s | c s
l | r r r r | r r.
language;generator data;scanner data
_
;static size;dyn. memory;total memory;time;table size;scanner size
;[bytes];[bytes];[bytes];[sec.];[bytes];[bytes]
_
Pascal ;102,970;238,464;341,434; 87.40;4,426;13,128
Modula ;102,970;201,344;304,314; 4.93;4,306;13,692
Oberon ;102,970;201,344;304,314; 5.71;5,122;14,284
C ;102,970;201,344;304,314; 7.24;5,702;13,296
Ada ;102,970;441,984;544,954;300.63;6,222;17,450
.TE
.)z
.pp
Table 3 compares scanners for different languages generated by Rex.
The sizes of the tables and the complete scanners all lie in the same range
which is relatively small. Only the generation times for Pascal and Ada are
exceptionally long. The reason is that these languages allow keywords to be
written with any letters no matter if upper or lower case. Therefore
keywords are no longer constant REs and can not be processed with the
presented linear algorithm.
.sh 1 Conclusion
.pp
The presented method allows the conversion of a NFA given by a set of REs to a DFA
in practically linear time. This holds under the assumption that only a few
non-constant but many constant REs have to be processed which is true in
many practical cases. Compared to LEX we gained a speed-up of up to 20.
Compared to flex which similarly improves the generation times the
generated scanners are smaller and faster.
Not only is the generation time reduced drastically
but also the space requirement during generation of
the scanner. The generator has to perform the subset construction,
state minimization, and table compression only for a
few states. Therefore the space needed by those algorithms is small. The
presented method directly constructs a space efficient representation of the
scanner control table which is used in combination with the comb-vector
technique\*([<\*([[ASU86\*(]]\*(>].
The method allows the generation of lexical analysers in a small amount of
time.
.\" Therefore scanner generators can become tools for more purposes than
.\" just compiler construction.
The generated scanners are powerful and
efficient enough to be used in production compilers.
Finally, scanner generators could be applied to a broader area than
just compiler construction.
.fi
.sz 12
.[]
.[-
.ds [F AhC75
.ds [A A\*(p]\*(a]V\*(p] Aho
.as [A \*(n]M\*(p]\*(a]J\*(p] Corasick
.ds [T Efficient String Matching: An Aid to Bibliographic Search
.nr [P 1
.ds [P 333-340
.ds [J Comm. ACM
.ds [V 18
.ds [D June 1975
.ds [N 6
.][
.[-
.ds [F ASU86
.ds [A A\*(p]\*(a]V\*(p] Aho
.as [A \*(c]R\*(p] Sethi
.as [A \*(m]J\*(p]\*(a]D\*(p] Ullman
.ds [T Compilers: Principles, Techniques, and Tools
.ds [I Addison Wesley
.ds [C Reading, M\&A
.ds [D 1986
.][
.[-
.ds [F Gro87a
.ds [A J\*(p] Grosch
.ds [T Rex - A Scanner Generator
.ds [I GMD Forschungsstelle an der Universit\\*:at Karlsruhe
.ds [R Compiler Generation Report No. 5
.ds [N 5
.ds [D Dec. 1987
.][
.[-
.ds [F Gro87b
.ds [A J\*(p] Grosch
.ds [T An Efficient Table-Driven Scanner
.ds [I GMD Forschungsstelle an der Universit\\*:at Karlsruhe
.ds [R Compiler Generation Report No. 1
.ds [N 1
.ds [D May 1987
.][
.[-
.ds [F Gro88
.ds [A J\*(p] Grosch
.ds [T Selected Examples of Scanner Specifications
.ds [I GMD Forschungsstelle an der Universit\\*:at Karlsruhe
.ds [R Compiler Generation Report No. 7
.ds [N 7
.ds [D Mar. 1988
.][
.[-
.ds [F Heu86
.ds [A V\*(p]\*(a]P\*(p] Heuring
.ds [T The Automatic Generation of Fast Lexical Analysers
.nr [P 1
.ds [P 801-808
.ds [J Software\(emPractice & Experience
.ds [V 16
.ds [N 9
.ds [D Sep. 1986
.][
.[-
.ds [F HoL87
.ds [A R\*(p]\*(a]N\*(p] Horspool
.as [A \*(n]M\*(p]\*(a]R\*(p] Levy
.ds [T Mkscan - An Interactive Scanner Generator
.nr [P 1
.ds [P 369-378
.ds [J Software\(emPractice & Experience
.ds [V 17
.ds [N 6
.ds [D June 1987
.][
.[-
.ds [F Les75
.ds [A M\*(p]\*(a]E\*(p] Lesk
.ds [T LEX \(em A Lexical Analyzer Generator
.ds [R Computing Science Technical Report 39
.ds [I Bell Telephone Laboratories
.ds [C Murray Hill, NJ
.ds [D 1975
.][
.[-
.ds [F M\*os86
.ds [A H\*(p] M\\*:ossenb\\*:ock
.ds [T Alex - A Simple and Efficient Scanner Generator
.ds [J SI\&GPLAN Notices
.ds [V 21
.ds [N 12
.nr [P 1
.ds [P 139-148
.ds [D Dec. 1986
.][
.[-
.ds [F Pax88
.ds [A V\*(p] Paxson
.ds [T Flex - Manual Pages
.ds [I Public Domain Software
.ds [D 1988
.][
.[-
.ds [F WaG84
.ds [A W\*(p]\*(a]M\*(p] Waite
.as [A \*(n]G\*(p] Goos
.ds [T Compiler Construction
.ds [I Springer Verlag
.ds [C New York, NY
.ds [D 1984
.][
.[-
.ds [F Wai86
.ds [A W\*(p]\*(a]M\*(p] Waite
.ds [T The Cost of Lexical Analysis
.nr [P 1
.ds [P 473-488
.ds [J Software\(emPractice & Experience
.ds [V 16
.ds [N 5
.ds [D May 1986
.][
.[-
.ds [F Wir85
.ds [A N\*(p] Wirth
.ds [T Programming in Modula-2
.nr [P 0
.ds [P 202
.ds [I Springer Verlag
.ds [C Heidelberg
.ds [D 1985
.ds [O Third Edition
.][
.bp
.uh "Appendix 1"
.sp
.(l L
.vs 12
\fBAlgorithm 4\fR: ambiguous states of a tunnel automaton (with one start state)
.FT
BEGIN
.sp 0.5
/* transition function using the tunnel function */
.sp 0.5
PROCEDURE NextState (State, Symbol)
BEGIN
LOOP
IF Control (State, Symbol) \(!= NoState THEN
RETURN Control (State, Symbol)
ELSE
State := Tunnel (State)
IF State = NoState THEN RETURN NoState END
END
END
END
.sp 0.5
/* compute the number of predecessors */
.sp 0.5
FORALL State \(mo StateSet DO
PredCount (State) := 0
END
.sp 0.5
FORALL State \(mo StateSet DO
FORALL Symbol \(mo Vocabulary DO
PredCount (NextState (State, Symbol)) +:= 1
END
END
.sp 0.5
/* iteratively remove states with 1 predecessor */
.sp 0.5
AmbiguousStates := StateSet - { NoState }
UnambiguousStates := { StartState }
.sp 0.5
WHILE UnambiguousStates \(!= \(O/ DO
State := SELECT UnambiguousStates
UnambiguousStates -:= { State }
AmbiguousStates -:= { State }
FORALL Symbol \(mo Vocabulary DO
Successor := NextState (State, Symbol)
IF PredCount (Successor) = 1 THEN
UnambiguousStates \(cu:= { Successor }
END
END
END
END
.)l
.if 0 \{\
.bp
.uh "Appendix 2"
.sp
.(l L
.vs 12
\fBAlgorithm 5\fR: ambiguous states (several start states)
.FT
BEGIN
.sp 0.5
/* compute the number of predecessors */
.sp 0.5
FORALL State \(mo StateSet DO /* initialize */
PredCount (State) := 0
END
.sp 0.5
FORALL StartState \(mo StartStateSet DO /* start states */
FORALL State \(mo StateSet DO
PredCount2 (State) := 0
END
FORALL Symbol \(mo Vocabulary DO
PredCount2 (Control (StartState, Symbol)) +:= 1
END
FORALL State \(mo StateSet DO
PredCount (State) := Max (PredCount (State), PredCount2 (State))
END
END
.sp 0.5
FORALL State \(mo StateSet - StartStateSet DO /* other states */
FORALL Symbol \(mo Vocabulary DO
PredCount (Control (State, Symbol)) +:= 1
END
END
.sp 0.5
/* iteratively remove states with 1 predecessor */
.sp 0.5
AmbiguousStates := StateSet - { NoState }
UnambiguousStates := StartStateSet
.sp 0.5
WHILE UnambiguousStates \(!= \(O/ DO
State := SELECT UnambiguousStates
UnambiguousStates -:= { State }
AmbiguousStates -:= { State }
FORALL Symbol \(mo Vocabulary DO
Successor := Control (State, Symbol)
IF PredCount (Successor) = 1 THEN
UnambiguousStates \(cu:= { Successor }
END
END
END
END
.)l
.\}
.bp
.uh "Appendix 2"
.lp
.sp 0.5
An example of a scanner specification for Modula-2 in Rex syntax:
.sp
.(l L
.vs 12
.FT
GLOBAL { VAR NestingLevel: CARDINAL; }
.sp 0.5
BEGIN { NestingLevel := 0; }
.sp 0.5
DEFAULT { Error ("illegal character:"); yyEcho; }
.sp 0.5
EOF { IF yyStartState = comment THEN Error ("unclosed comment"); END; }
.sp 0.5
DEFINE
.sp 0.5
digit = {0-9} .
letter = {a-z A-Z} .
cmt = - {*(\\t\\n} .
.sp 0.5
START comment
.sp 0.5
RULES
.sp 0.5
"(*" : {INC (NestingLevel); yyStart (comment);}
#comment# "*)" : {DEC (NestingLevel); IF NestingLevel = 0 THEN yyStart (STD); END;}
#comment# "(" | "*" | cmt + : {}
.sp 0.5
#STD# digit + ,
#STD# digit + / ".." : {RETURN 1;}
#STD# {0-7} + B : {RETURN 2;}
#STD# {0-7} + C : {RETURN 3;}
#STD# digit {0-9 A-F} * H : {RETURN 4;}
#STD# digit + "." digit * (E {+\\-} ? digit +) ? : {RETURN 5;}
.sp 0.5
#STD# ' - {\\n'} * ' |
\\" - {\\n"} * \\" : {RETURN 6;}
.sp 0.5
#STD# ":" : {RETURN 7;}
#STD# "=" : {RETURN 8;}
#STD# ":=" : {RETURN 9;}
\&...
.sp 0.5
#STD# AND : {RETURN 34;}
#STD# ARRAY : {RETURN 35;}
#STD# BEGIN : {RETURN 36;}
\&...
.sp 0.5
#STD# letter (letter | digit) * : {RETURN 74;}
.)l
.if 0 \{\
.(l L
.vs 12
.FT
GLOBAL { VAR NestingLevel: CARDINAL; }
.sp 0.5
BEGIN { NestingLevel := 0; }
.sp 0.5
EOF { IF yyStartState = comment THEN Error ("unclosed comment"); END; }
.sp 0.5
DEFINE
.sp 0.5
digit = {0-9} .
letter = {a-z A-Z} .
cmt = - {*(\\t\\n} .
.sp 0.5
START comment
.sp 0.5
RULES
.sp 0.5
"(*" : {INC (NestingLevel); yyStart (comment);}
#comment# "*)" : {DEC (NestingLevel); IF NestingLevel = 0 THEN yyStart (STD); END;}
#comment# "(" | "*" | cmt + : {}
.sp 0.5
#STD# digit + ,
#STD# digit + / ".." : {RETURN 1;}
#STD# {0-7} + B : {RETURN 2;}
#STD# {0-7} + C : {RETURN 3;}
#STD# digit {0-9 A-F} * H : {RETURN 4;}
#STD# digit + "." digit * (E {+\\-} ? digit +) ? : {RETURN 5;}
.sp 0.5
#STD# ' - {\\n'} * ' |
\\" - {\\n"} * \\" : {RETURN 6;}
.sp 0.5
#STD# "#" : {RETURN 7;}
#STD# "&" : {RETURN 8;}
#STD# "(" : {RETURN 9;}
#STD# ")" : {RETURN 10;}
#STD# "*" : {RETURN 11;}
#STD# "+" : {RETURN 12;}
#STD# "," : {RETURN 13;}
#STD# "-" : {RETURN 14;}
#STD# "." : {RETURN 15;}
#STD# ".." : {RETURN 16;}
#STD# "/" : {RETURN 17;}
#STD# ":" : {RETURN 18;}
#STD# ":=" : {RETURN 19;}
#STD# ";" : {RETURN 20;}
#STD# "<" : {RETURN 21;}
#STD# "<=" : {RETURN 22;}
#STD# "<>" : {RETURN 23;}
#STD# "=" : {RETURN 24;}
#STD# ">" : {RETURN 25;}
#STD# ">=" : {RETURN 26;}
#STD# "[" : {RETURN 27;}
#STD# "]" : {RETURN 28;}
#STD# "^" : {RETURN 29;}
#STD# "{" : {RETURN 30;}
#STD# "|" : {RETURN 31;}
#STD# "}" : {RETURN 32;}
#STD# "~" : {RETURN 33;}
.sp 0.5
#STD# AND : {RETURN 34;}
#STD# ARRAY : {RETURN 35;}
#STD# BEGIN : {RETURN 36;}
#STD# BY : {RETURN 37;}
#STD# CASE : {RETURN 38;}
#STD# CONST : {RETURN 39;}
#STD# DEFINITION : {RETURN 40;}
#STD# DIV : {RETURN 41;}
#STD# DO : {RETURN 42;}
#STD# ELSE : {RETURN 43;}
#STD# ELSIF : {RETURN 44;}
#STD# END : {RETURN 45;}
#STD# EXIT : {RETURN 46;}
#STD# EXPORT : {RETURN 47;}
#STD# FOR : {RETURN 48;}
#STD# FROM : {RETURN 49;}
#STD# IF : {RETURN 50;}
#STD# IMPLEMENTATION : {RETURN 51;}
#STD# IMPORT : {RETURN 52;}
#STD# IN : {RETURN 53;}
#STD# LOOP : {RETURN 54;}
#STD# MOD : {RETURN 55;}
#STD# MODULE : {RETURN 56;}
#STD# \\NOT : {RETURN 57;}
#STD# OF : {RETURN 58;}
#STD# OR : {RETURN 59;}
#STD# POINTER : {RETURN 60;}
#STD# PROCEDURE : {RETURN 61;}
#STD# QUALIFIED : {RETURN 62;}
#STD# RECORD : {RETURN 63;}
#STD# REPEAT : {RETURN 64;}
#STD# RETURN : {RETURN 65;}
#STD# SET : {RETURN 66;}
#STD# THEN : {RETURN 67;}
#STD# TO : {RETURN 68;}
#STD# TYPE : {RETURN 69;}
#STD# UNTIL : {RETURN 70;}
#STD# VAR : {RETURN 71;}
#STD# WHILE : {RETURN 72;}
#STD# WITH : {RETURN 73;}
.sp 0.5
#STD# letter (letter | digit) * : {RETURN 74;}
.sp 0.5
#STD# ANY : {Error ("illegal character");}
.)l
.(l L
.vs 12
.FT
digit [0-9]
letter [a-zA-Z]
CMT [^*(]
STR1 [\\t !-&(-~]
STR2 [\\t !#-~]
%Start comment
%%
int NestingLevel = 0;
"(*"{CMT}* {NestingLevel ++; BEGIN comment;}
<comment>"*)" {NestingLevel --; if (NestingLevel == 0) BEGIN 0;}
<comment>"(" |
<comment>"*" |
<comment>{CMT}+ ;
" "* ;
\\n ;
\\t ;
{digit}+ |
{digit}+/".." return 1;
[0-7]+B return 2;
[0-7]+C return 3;
{digit}[0-9A-F]*H return 4;
{digit}+"."{digit}*(E[+-]?{digit}+)? return 5;
'{STR1}*' |
\\"{STR2}*\\" return 6;
"#" return 7;
"&" return 8;
"(" return 9;
")" return 10;
"*" return 11;
"+" return 12;
"," return 13;
"-" return 14;
"." return 15;
".." return 16;
"/" return 17;
":" return 18;
":=" return 19;
";" return 20;
"<" return 21;
"<=" return 22;
"<>" return 23;
"=" return 24;
">" return 25;
">=" return 26;
"[" return 27;
"]" return 28;
"^" return 29;
"{" return 30;
"|" return 31;
"}" return 32;
"~" return 33;
AND return 34;
ARRAY return 35;
BEGIN return 36;
BY return 37;
CASE return 38;
CONST return 39;
DEFINITION return 40;
DIV return 41;
DO return 42;
ELSE return 43;
ELSIF return 44;
END return 45;
EXIT return 46;
EXPORT return 47;
FOR return 48;
FROM return 49;
IF return 50;
IMPLEMENTATION return 51;
IMPORT return 52;
IN return 53;
LOOP return 54;
MOD return 55;
MODULE return 56;
NOT return 57;
OF return 58;
OR return 59;
POINTER return 60;
PROCEDURE return 61;
QUALIFIED return 62;
RECORD return 63;
REPEAT return 64;
RETURN return 65;
SET return 66;
THEN return 67;
TO return 68;
TYPE return 69;
UNTIL return 70;
VAR return 71;
WHILE return 72;
WITH return 73;
{letter}({letter}|{digit})* return 74;
.)l
.\}
.bp 1
.lp
.b Contents
.sp
.xp